Aim

Expanding the work. Find a publicly available data set and apply the same workflow and adapt some of the code to make it work.

Introduction

To look for new scRNA-seq data the 10xgenomics homepage was used. The data-set contains Human peripheral blood mononuclear cells (PBMCs) of a healthy female donor (age: 25-30). For library generation ~16,000 cells (11,996 cells recovered) were sequenced on an Illumina NovaSeq 6000 to a read depth of approximately 40,000 mean reads per cell. The summary shows that more than 40,000 reads per cell and app. 2000 genes per cell (median) could be detected. The quality scores for sequencing and mapping show satisfactory results.

Load libraries

###################
#load libraries
###################
library(dplyr)
## 
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
library(patchwork)
library(ggplot2)

Setting up the Seurat Object

############################
#Setup the Seurat Object
############################

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "./data_new/002/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc10k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat 
## 22302 features across 11922 samples within 1 assay 
## Active assay: RNA (22302 features, 0 variable features)

Pre-processing

Quality control

Visualize QC metrics as a violin plot * The metrics showed a different distribution compared to the tutorial. The cutoff values will therefore have to be adjusted

# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

Similar to the tutorial, there is a strong correlation between nCount and nFeature

# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

Filtering

Based on the violin plots, the following criteria are used: * 500 < nFeature_RNA < 5000 * percent.mt < 15 %

pbmc <- subset(pbmc, subset = nFeature_RNA > 500 & nFeature_RNA < 5000 & percent.mt < 15)

Normalizing the data

pbmc <- NormalizeData(pbmc)

Identification of highly variable features (feature selection)

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2

Scaling the data

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix

Dimensionality reduction

PCA

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1 
## Positive:  FGL2, FCN1, LYZ, CST3, CTSS, MNDA, TYMP, CYBB, SERPINA1, IFI30 
##     PSAP, NCF2, LST1, S100A9, TYROBP, AIF1, MPEG1, CSTA, TNFSF13B, FTL 
##     TMEM176B, DUSP6, CD68, FOS, GRN, MS4A6A, DUSP1, S100A8, AC020656.1, SPI1 
## Negative:  LTB, CD3E, IL32, CD3D, TRAC, CD3G, PCED1B-AS1, TRBC2, ETS1, BCL11B 
##     MALAT1, IL7R, ARL4C, CD2, LIME1, TCF7, CD7, LINC00861, PRKCQ-AS1, CD27 
##     PIK3IP1, CCR7, FCMR, CD247, TRBC1, GZMM, LEF1, NOSIP, ISG20, IKZF3 
## PC_ 2 
## Positive:  IL32, CD3E, CD3D, GZMM, CD3G, CD2, ANXA1, CTSW, CD247, CD7 
##     BCL11B, TRAC, S100A4, GZMA, NKG7, CST7, LINC00861, PRF1, IL7R, TMSB4X 
##     KLRK1, GNLY, CCL5, ARL4C, KLRD1, S100A10, SAMD3, TRBC1, ID2, PRKCQ-AS1 
## Negative:  MS4A1, CD79A, IGHM, BANK1, CD79B, LINC00926, RALGPS2, TNFRSF13C, NIBAN3, IGHD 
##     BCL11A, SPIB, IGKC, CD22, HLA-DQA1, AFF3, LINC02397, PAX5, FCRL1, VPREB3 
##     BLK, FCER2, CD74, COBLL1, HLA-DRA, TCF4, BLNK, SWAP70, HLA-DRB1, GNG7 
## PC_ 3 
## Positive:  CCR7, LEF1, IL7R, TCF7, PIK3IP1, MAL, RCAN3, TRABD2A, NOSIP, LTB 
##     CD27, CAMK4, OXNAD1, PRKCQ-AS1, PDE3B, TRAC, CHRM3-AS2, FHIT, LRRN3, BCL11B 
##     PASK, NELL2, S100A12, RGCC, TRAT1, CD3G, VCAN, ADTRP, S100A8, CD3D 
## Negative:  GZMB, NKG7, GNLY, KLRD1, CST7, PRF1, CLIC3, GZMA, FGFBP2, KLRF1 
##     CCL4, HOPX, SPON2, GZMH, CCL5, ADGRG1, C12orf75, PTGDR, CD160, FCGR3A 
##     TBX21, XCL2, CTSW, MATK, IL2RB, PLEK, TRDC, CTSC, LAIR2, S1PR5 
## PC_ 4 
## Positive:  CD79B, MS4A1, FCGR3A, CD79A, LINC00926, GNLY, IGHD, TNFRSF13C, BANK1, KLRD1 
##     FCER2, NKG7, CD22, CCL4, FGFBP2, PRF1, FCRL1, CST7, PAX5, MTSS1 
##     FCRL3, RALGPS2, SWAP70, VPREB3, IFITM2, HOPX, KLRF1, GZMA, LINC02397, GZMH 
## Negative:  LILRA4, CLEC4C, SERPINF1, LRRC26, TPM2, SCT, MAP1A, DNASE1L3, EPHB1, LINC00996 
##     LAMP5, ITM2C, PTCRA, TNFRSF21, PACSIN1, CIB2, SMPD3, SCAMP5, PHEX, PLD4 
##     SCN9A, PLEKHD1, DERL3, IL3RA, ZFAT, NRP1, AC097375.1, SLC35F3, GAS6, SMIM5 
## PC_ 5 
## Positive:  CDKN1C, HES4, TCF7L2, CTSL, BATF3, CSF1R, FCGR3A, MS4A7, CKB, SIGLEC10 
##     CASP5, RRAS, IFITM3, AC104809.2, MS4A4A, MTSS1, CALML4, NAP1L1, NEURL1, SMIM25 
##     AC064805.1, CAMK1, RHOC, ABI3, GPBAR1, HMOX1, ZNF703, RNASET2, FAM110A, HCAR3 
## Negative:  S100A12, SLC2A3, VCAN, ITGAM, CYP1B1, PADI4, S100A8, CES1, MEGF9, MGST1 
##     VNN2, QPCT, MCEMP1, THBS1, CD14, RBP7, CTSD, CR1, AC020916.1, BST1 
##     MARC1, CKAP4, PLBD1, ANXA1, F5, ALOX5AP, CRISPLD2, RNASE2, AC020656.1, HMGB2
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  FGL2, FCN1, LYZ, CST3, CTSS 
## Negative:  LTB, CD3E, IL32, CD3D, TRAC 
## PC_ 2 
## Positive:  IL32, CD3E, CD3D, GZMM, CD3G 
## Negative:  MS4A1, CD79A, IGHM, BANK1, CD79B 
## PC_ 3 
## Positive:  CCR7, LEF1, IL7R, TCF7, PIK3IP1 
## Negative:  GZMB, NKG7, GNLY, KLRD1, CST7 
## PC_ 4 
## Positive:  CD79B, MS4A1, FCGR3A, CD79A, LINC00926 
## Negative:  LILRA4, CLEC4C, SERPINF1, LRRC26, TPM2 
## PC_ 5 
## Positive:  CDKN1C, HES4, TCF7L2, CTSL, BATF3 
## Negative:  S100A12, SLC2A3, VCAN, ITGAM, CYP1B1
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:9, cells = 500, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset

Due to the larger dataset, more dimensions were required to capture the full variation of cells. The Jackstraw procedure and the elbow plot produce different results, and the elbow plot is difficult to interpret at a high number of PCs. Therefore, we chose to orient ourselves on the Jackstraw plot and “err on the high side”, and used 40 components for clustering.

pbmc <- JackStraw(pbmc, num.replicate = 100, dims = 50) # Takes a long time (~ 20 min on my PC)
pbmc <- ScoreJackStraw(pbmc, dims = 1:50)
JackStrawPlot(pbmc, dims = 1:40)
## Warning: Removed 60469 rows containing missing values (geom_point).

ElbowPlot(pbmc, ndims = 50)

# optionally save the current state of the data
#saveRDS(pbmc, file = "./pbmc_newdata.rds")

Cluster the cells

The resolution had to be adjusted to detect the different cell types and subtypes. We found that with a resolution of 0.1, the clusters represented basic cell types (T, B, Mono, etc.). To also capture the various subgroups of cells, a resolution of 1 was used. 21 different clusters were found, compared to 9 in the tutorial dataset.

pbmc <- FindNeighbors(pbmc, dims = 1:40)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 11482
## Number of edges: 465145
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8681
## Number of communities: 21
## Elapsed time: 1 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
## AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1 
##                 10                  0                  5                  3 
## AAACCCACAATCGAAA-1 
##                  6 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

Run non-linear dimensional reduction (UMAP/tSNE)

The same number of dimensions as in clustering had to be used here.

pbmc <- RunUMAP(pbmc, dims = 1:40)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 19:51:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 19:51:26 Read 11482 rows and found 40 numeric columns
## 19:51:26 Using Annoy for neighbor search, n_neighbors = 30
## 19:51:26 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 19:51:28 Writing NN index file to temp file C:\Users\Daniel\AppData\Local\Temp\RtmpuQca4G\file4348208a76d7
## 19:51:28 Searching Annoy index using 1 thread, search_k = 3000
## 19:51:31 Annoy recall = 100%
## 19:51:31 Commencing smooth kNN distance calibration using 1 thread
## 19:51:32 Initializing from normalized Laplacian + noise
## 19:51:33 Commencing optimization for 200 epochs, with 502384 positive edges
## 19:51:46 Optimization finished
# `label = TRUE` to help identify the clusters
DimPlot(pbmc, reduction = "umap", label = TRUE)

#run TSNE
pbmc <- RunTSNE(pbmc, dims = 1:40)
DimPlot(pbmc, reduction = "tsne", label = TRUE)

Overall, tSNE results in a better separation of clusters in our opinion. Clusters that were not fully separated using umap (clusters 0, 4, 12) were separate here.

Finding differentially expressed features (cluster biomarkers)

Find markers for every cluster compared to all remaining cells, report only the positive ones.

# find markers for every cluster compared to all remaining cells, report only the positive
# ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
pbmc.markers %>%
    group_by(cluster) %>%
    slice_max(n = 10, order_by = avg_log2FC) -> top10

top10

The ten most significant genes for each cluster are printed. By examining these genes, a first indication of the cell type making up some of the clusters can be gained.

0, 10, 12: Monocytes (CD14, FCGR3A) 1, 11: CD8+ T cells (CD8A, CD8B) 6, 7, 17, 19: B cells (IG…) 8: NK cells (NKG7) 18: Platelets (PPBP)

Cluster annotation

To conclusively determine the cell type of each cluster, we researched relevant marker genes for PBMCs. The following assignment is based on the last section of the Seurat tutorial as well as Wang et al., 2021 (https://doi.org/10.1038/s41467-021-25771-5)

First, we we determined the base cell type of each cluster. The following violin plots show the expression of at least one relevant marker gene for each type.

  • CD3D is a marker for all typed of T cells (CD4, CD8, etc.). 1, 2, 3, 9, 11, 14, 5 therefore likely represent T cells. Cluster 12 also shows a significant expression, tough it is lower than all other T cell clusters. This cluster is also not found in the vicinity of the remaining T cells in both uMap and tSNE.
  • NCAM1 is a marker gene for NK cells.
  • CD19 is a marker gene for B cells.
  • CD14 and FCGR3A (CD16) are marker genes for monocytes. Note that cluster 12 also shows a strong expression of CD14, making it more likely that this cluster is (mainly) made up of monocytes. This would also be consistent with the position of the cluster in the aforementioned plots.
  • CD1C is a marker gene for myeloid dendritic cells.
  • LILRA4 is a marker gene for plasmocytoid dendritic cells.
  • CD34 is a marker gene for progenitor/stem cells.
  • PPBP is a marker gene for platelets.
VlnPlot(pbmc, features = c("CD3D", "NCAM1", "CD19", "CD14", "FCGR3A", "CD1C", "LILRA4", "CD34", "PPBP"))

Next, we examined each cell type more closely to determine the subtypes of each cluster.

Monocytes

Monocytes are usually divided into two main group, CD14+ and CD16+ (FCGR3A), as well as an intermediate type.

  • Clusters 0 and 4 consist of CD14+ Monocytes.
  • Cluster 10 consists of CD16+ Monocytes.
  • Cluster 5 is made up of intermediate Monocytes.
  • Cluster 12 (which as mentioned before exhibits properies of both T cells and Monocytes), seems to be at least partly made up of intermediate Monocytes as well.
VlnPlot(pbmc, features = c("CD14", "FCGR3A"), idents = c(0,4,5,10,12))

B cells

Three marker genes were examined for B cells. MS4A1 is expressed by naive and memory B cells, but not by plasma cells. CD27 is expressed by memory B cells and plasma cells and CD38 is only expressed by plasma cells.

  • Cluster 6 represents memory B cells.
  • Cluster 7 consists of naive B cells.
  • Cluster 17 shows intermediate properties between naive and memory cells.
  • Cluster 19 consists of plasma cells.
VlnPlot(pbmc, features = c("MS4A1", "CD27", "CD38"), idents = c(6,7,17,19))

T cells

Six marker genes were considered for determine T cell subtypes. CD4 and CD8A are used to differentiate between the two main subgroups, CD4+ and CD8+ T cells. CCR7 and S100A4 are used to differentiate between naive (only CCR7), central memory (both CCR7 and S100A4) and effector memory cells (only S100A4). Additionally, FOXP3 is a marker gene for CD4+ regulatory T cells, while TRDC indicates gamma delta T cells.

  • Cluster 1 consists of CD8+ naive T cells.
  • Cluster 2 consists of CD4+ memory T cells.
  • Cluster 3 consists of CD4+ naive T cells.
  • Cluster 9 consists of CD8+ effector memory T cells.
  • Cluster 11 consists of CD8* central memory T cells.
  • Cluster 14 consists of gamma delta T cells.
  • Cluster 15 consists of CD4+ regulatory T cells.
VlnPlot(pbmc, features = c("CD4", "CD8A", "CCR7", "S100A4", "FOXP3", "TRDC"), idents = c(1,2,3,9,11,12,14,15))

To visualize the distribution of the main cell types, the expression level of relevant marker are shown in feature plots (for both UMAP and tSNE).

FeaturePlot(pbmc, features = c("CD3D", "NCAM1", "CD19", "CD14", "FCGR3A", "CD1C", "LILRA4", "CD34", "PPBP"))

FeaturePlot(pbmc, features = c("CD3D", "NCAM1", "CD19", "CD14", "FCGR3A", "CD1C", "LILRA4", "CD34", "PPBP"), reduction = "tsne")

Expression levels of the ten most significant genes for each cluster are also visualized as a heatmap. Similarities between clusters supports those clusters belonging to the same cell type, e.g. 0, 4, 5, 10, 12 for Monocytes and 1, 2, 3, 9, 11 for T cells.

pbmc.markers %>%
    group_by(cluster) %>%
    top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

Assigning cell type identity to clusters

Finally, UMAP and tSNE plots were annotated with the previously determined cluster identities. As expected, cells of the same type are generally clustered closely together.

new.cluster.ids <- c("CD14+ Monocytes", 
                     "CD8+ naive T cells", 
                     "CD4+ memory T cells", 
                     "CD4+ naive T cells", 
                     "CD14+ Monocytes", 
                     "Intermediate Monocytes", 
                     "Memory B cells", 
                     "Naive B cells", 
                     "NK cells",
                     "CD8+ effector memory T cells",
                     "CD16+ Monocytes",
                     "CD8+ central memory T cells",
                     "Unknown, likely Monocytes",
                     "Myeloid dendritic cells",
                     "Gamma delta T cells",
                     "CD4+ regulatory T cells",
                     "Plasmocytoid dentritic cells",
                     "Intermediate B cells",
                     "Platelets",
                     "Plasma cells",
                     "Hematopoietic stem cells")

names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
dimplot1 <- DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5, repel = TRUE, label.box = TRUE) + NoLegend()

dimplot1$layers[[3]] <- unserialize(serialize(dimplot1$layers[[2]], NULL))

dimplot1$layers[[2]]$geom_params$seed <- 1234
dimplot1$layers[[3]]$geom_params$seed <- 1234

dimplot1$layers[[2]]$aes_params$alpha <- 0.5
dimplot1$layers[[3]]$aes_params$fill <- NA

dimplot1

dimplot2 <- DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 0.5, repel = TRUE, label.box = TRUE) + NoLegend()

dimplot2$layers[[3]] <- unserialize(serialize(dimplot2$layers[[2]], NULL))

dimplot2$layers[[2]]$geom_params$seed <- 1234
dimplot2$layers[[3]]$geom_params$seed <- 1234

dimplot2$layers[[2]]$aes_params$alpha <- 0.5
dimplot2$layers[[3]]$aes_params$fill <- NA

dimplot2